Purpose

This markdown will be used to validate stratification BED files for the v3.0-stratification release. Validation includes:

  1. Confirm expected chromosomes are covered and coverage of individual chromosomes is as expected
  2. Confirm coverage between GRCh37 and GRCh38 is similar or as expected.

Coverage Plots

library(tidyverse)
library(here)
## create vector of file paths to be used for dataframes
beds38 <- list.files(here("v3.0-stratifications/GRCh38"),
                     full.names = TRUE,
                     pattern = ".bed.gz")
beds37 <- list.files(here("v3.0-stratifications/GRCh37"),
                     full.names = TRUE,
                     pattern = ".bed.gz")

## Create dataframe of file data using list files
beds38_set <- set_names(beds38)
beds37_set <- set_names(beds37)
beds38_df <- map_dfr(.x= beds38_set,
                     read_tsv, 
                     skip =1,
                     col_types = "cdd",
                     col_names= c("chrom","start","end"),
                     .id = "stratification_filename") %>%
#change stratification column to just the base name of file, removing path  
  mutate(stratification_filename = basename(stratification_filename))

beds37_df <- map_dfr(.x= beds37_set,
                     read_tsv, 
                     skip = 1,
                     col_names= c("chrom","start","end"),
                     col_types = "cdd",
                     .id = "stratification_filename") %>%
#change stratification column to just the base name of file, removing path
  mutate(stratification_filename = basename(stratification_filename))

#stratfile_all_summary <- here("v3.0-stratfications-info.csv") %>%
stratfile_all_summary <- here("v3.0-stratfications-info.csv") %>%
  read.csv(header = TRUE, na.strings = TRUE) 

#strat38_summary <- here("v3.0-stratfications-info.csv") %>% 
strat38_summary <- here("v3.0-stratfications-info.csv") %>% 
  read.csv(header = TRUE, na.strings = TRUE)  %>% 
  filter(reference =="GRCh38") 

#strat37_summary <- here("v3.0-stratfications-info.csv") %>%
strat37_summary <- here("v3.0-stratfications-info.csv") %>%
  read.csv(header = TRUE, na.strings = TRUE) %>% 
  filter(reference =="GRCh37")

beds38_df_with_levels <- left_join(beds38_df,strat38_summary) 

beds37_df_with_levels <- left_join(beds37_df,strat37_summary) 

## Chromosome Presence/ Absence  #####################################
###########################################################################
## vector of chromosomes we expect to have

chroms <- c(1:22, "X", "Y")
expected_chroms <- factor(c(paste0("chr", chroms), chroms),
                          levels = c(paste0("chr", chroms), chroms))

## dataframes with Expected and Unexpected chroms for plotting
expected_chroms_38 <- beds38_df_with_levels %>%
  filter(chrom %in% expected_chroms)
unexpected_chroms_38 <- beds38_df_with_levels %>%
  filter(!chrom %in% expected_chroms)

expected_chroms_37 <- beds37_df_with_levels %>%
  filter(chrom %in% expected_chroms)
unexpected_chroms_37 <- beds37_df_with_levels %>%
  filter(!chrom %in% expected_chroms)

## Get Chromosome Sizes -- alternate method using UCSC Ns from Aaron script
###########################################################################
#read in genome BEDs that have had Ns and PSA regions for Y removed
cols <- c("chrom","start","end")

chromregions_37 <- read_tsv("human.b37.1_22XY.genome.sorted_noNs_noPSAY_merged.bed",
                            col_names = cols) %>%
  mutate(region_size = end - start)

chromregions_38 <- read_tsv("human.hg38.1_22XY.genome.sorted_noNs_noPSAY_merged.bed",
                            col_names = cols)%>%
  mutate(region_size = end - start)

chromsizes_37 <- chromregions_37 %>%
  group_by(chrom) %>%
  summarise(nonN_ref_len = sum(region_size))
chromsizes_37$chrom <-
  paste0("chr", chromsizes_37$chrom)

chromsizes_38 <- chromregions_38 %>%
  group_by(chrom) %>%
  summarise(nonN_ref_len = sum(region_size))

## Create Dataframes for plotting chromosome sizes ########################
###########################################################################

## Add column to table of 37 and 38  to calculate region sizes
beds37_df_with_levels <- mutate(beds37_df_with_levels,
                                              region_size = end - start)

beds38_df_with_levels <- mutate(beds38_df_with_levels,
                                              region_size = end - start)

#Sum of all regions by chromosome
chrom_region_sizes_37stratifications <- beds37_df_with_levels %>%
  group_by(stratification_filename, chrom) %>%
  summarise(total_region = sum(region_size))

chrom_region_sizes_38stratifications <- beds38_df_with_levels %>%
  group_by(stratification_filename, chrom) %>%
  summarise(total_region = sum(region_size))

## append "chr" prefix to chroms in GRCh37 df to reduce number of fields when
## joined with 38. Found there were some with "chr" already appended therefore
## needed to remove all chr first then paste chr to ensure single chr prefix

chrom_region_sizes_37stratifications$chrom <-
  str_remove_all(chrom_region_sizes_37stratifications$chrom, "chr")
chrom_region_sizes_37stratifications$chrom <-
  paste0("chr", chrom_region_sizes_37stratifications$chrom)

## combine ref and strat region size dataframes
## total_region=sum of stratification chrom regions size (use for plots),
## Non_N=ref chrom size w/o Ns (use for plots), len=total length of chrom regions

stratANDref_37chromsizes <- left_join(chrom_region_sizes_37stratifications,
                                      chromsizes_37)
stratANDref_37chromsizes <- add_column(stratANDref_37chromsizes,
                                       "reference" = "GRCh37")

stratANDref_38chromsizes <- left_join(chrom_region_sizes_38stratifications,
                                      chromsizes_38)
stratANDref_38chromsizes <- add_column(stratANDref_38chromsizes,
                                       "reference" = "GRCh38")

## Create table with all stratification and reference chrom sizes and
## stratification types
full <- full_join(stratANDref_37chromsizes,stratANDref_38chromsizes)

summary_for_plots <- full %>% left_join(stratfile_all_summary)
summary_for_plots <- summary_for_plots  %>% mutate(coverage = total_region/nonN_ref_len)
chrom_order <- paste0("chr",c(1:22, "X","Y"))
summary_for_plots$chrom <- factor(summary_for_plots$chrom, levels = c(chrom_order))

## Coverage Plots #########################################################
###########################################################################
coverage_plot <- function(strat_type_df, strat_type){

  
    ggplot(strat_type_df,
         aes(x=chrom, y=coverage, group = reference)) +
    geom_col(aes(fill = reference), position = "dodge") +
    facet_grid(scales = "free_y", stratification_level ~ .) +
    theme(legend.position = "top",
          text = element_text(size=30),
          axis.text.x = element_text(angle=90),
          strip.text.y = element_text(angle = 0),
          panel.spacing.y = unit(2, "lines"))
}

#Generate plots for desired stratification types (code below)

Check for stratification files for unexpected chromosomes

Expected chromosomes are chr1-22, X and Y.

#if unexpected DF rows are empty, report no unexpected chroms found
if ((dim(unexpected_chroms_38)[1]) == 0) {
  print("No unexpected chromosomes found for GRCh38 stratifications.")
}else{
  print("Unexpected chromosomes found for GRCh38")
  unexpected_chroms_38
}
## [1] "No unexpected chromosomes found for GRCh38 stratifications."
if ((dim(unexpected_chroms_37)[1]) == 0) {
  print("No unexpected chromosomes found for GRCh37 stratifications.")
}else{
  print("Unexpected chromosomes found for GRCh37")
  unexpected_chroms_37
}
## [1] "No unexpected chromosomes found for GRCh37 stratifications."

Low Complexity

Regions with different types and sizes of low complexity sequence (e.g., homopolymers, STRs, VNTRs, other locally repetitive sequences).

# summary_for_plots %>%
#   filter(stratification_type == "LowComplexity") %>%
#   coverage_plot(strat_type = "LowComplexity") 
LowComplexity <-filter(summary_for_plots, stratification_type == "LowComplexity")
  ggplot(LowComplexity, aes(x=chrom, y=coverage, group = reference)) +
  geom_col(aes(fill = reference), position = "dodge") +
    facet_grid(scales = "free", stratification_level ~ .) +
    theme(legend.position = "top",
          text = element_text(size=30),
          axis.text.x = element_text(angle=90),
          strip.text.y = element_text(angle = 0))

Functional Technically Difficult

Different functional, or potentially functional, regions that are also likely to be technically difficult to sequence.

summary_for_plots %>%
  filter(stratification_type == "FunctionalTechnicallyDifficult") %>%
  coverage_plot(strat_type = "FunctionalTechnicallyDifficult")

Genome Specific (benchmark v4.2.1)

Difficult regions due to potentially difficult variation in the NIST/GIAB sample, including (1) regions containing putative compound heterozygous variants, (2) regions containing multiple variants within 50 bp of each other, (3) regions with potential structural variation and copy number variation.

GSpecific <-filter(summary_for_plots, stratification_type == "GenomeSpecific")
ggplot(GSpecific, aes(x=chrom, y=coverage, group = reference)) +
  geom_col(aes(fill = reference ), position = "dodge") +
  facet_grid(stratification_level ~ genome, scales = "free") +
  theme(axis.text.x = element_text(angle = 90),
        legend.position = "top",
        strip.text.y = element_text(angle = 0))

Functional Regions

Regions to stratify variants inside and outside of coding regions.

summary_for_plots %>%
  filter(stratification_type == "FunctionalRegions") %>%
  coverage_plot(strat_type = "FunctionalRegions")

GC Content

Regions with different ranges(%) of GC content.

summary_for_plots %>%
  filter(stratification_type == "GCcontent") %>%
  coverage_plot(strat_type = "GCcontent")

Mappability

Regions where short read mappability can be challenging.

summary_for_plots %>%
  filter(stratification_type == "mappability") %>%
  coverage_plot(strat_type = "mappability")

Other Difficult

Highly variable regions like the VDJ and MHC, near gaps in the reference, or errors in the reference.

summary_for_plots %>%
  filter(stratification_type == "OtherDifficult") %>%
  coverage_plot(strat_type = "OtherDifficult")

Segmental Duplications

Regions with segmental duplications or regions with non-trivial self-chain alignments.

summary_for_plots %>%
  filter(stratification_type == "SegmentalDuplications") %>%
  coverage_plot(strat_type = "SegmentalDuplications")

Union

Regions with different general types of difficult regions or any type of difficult region or complex variant. For example, performance can be measured in just the “easy” regions of the genome.

summary_for_plots %>%
  filter(stratification_type == "union") %>%
  coverage_plot(strat_type = "union")

Ancestry

Regions with inferred patterns of local ancestry in GRCh38.

summary_for_plots %>%
  filter(stratification_type == "ancestry") %>%
  coverage_plot(strat_type = "ancestry")

Session Information

System Information

library(sessioninfo)
sessioninfo::platform_info()
##  setting  value                       
##  version  R version 3.6.3 (2020-02-29)
##  os       macOS  10.16                
##  system   x86_64, darwin15.6.0        
##  ui       X11                         
##  language (EN)                        
##  collate  en_US.UTF-8                 
##  ctype    en_US.UTF-8                 
##  tz       America/New_York            
##  date     2021-10-28

Package Versions

sessioninfo::package_info() %>%
    filter(attached == TRUE) %>%
    select(package, loadedversion, date, source)

```